import timeit
import scanpy as sc
import anndata as ad
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import warnings
from copy import deepcopy
import timeit
from random import choice
import time
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from itertools import cycle, islice
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
# distance calculator
# distance calculator
def dist(a, b, ax=1):
    """Return the Euclidean distance between `a` and `b` along axis `ax`."""
    difference = a - b
    return np.linalg.norm(difference, axis=ax)
We generated several different datasets to test our implementations and to compare them with the corresponding scikit-learn implementations.
# Generate four synthetic 2-D datasets used to benchmark the clustering code.
n_samples = 1000  # size of each dataset; adjustable

# 1) two concentric noisy circles
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
points_noisy_circles = noisy_circles[0]
f1, f2 = points_noisy_circles[:, 0], points_noisy_circles[:, 1]
plt.scatter(f1, f2, c='black', s=7)
plt.title('Noisy circle')

# 2) two interleaving noisy half-moons
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
points_noisy_moons = noisy_moons[0]
f1, f2 = points_noisy_moons[:, 0], points_noisy_moons[:, 1]
plt.scatter(f1, f2, c='black', s=7)
plt.title('Noisy moons')

# 3) uniform random noise (no cluster structure)
no_structure = np.random.rand(n_samples, 2)
f1, f2 = no_structure[:, 0], no_structure[:, 1]
plt.title('No structure')
plt.scatter(f1, f2, c='black', s=7)

# 4) isotropic Gaussian blobs (fixed seed for reproducibility)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=15)
points_blobs = blobs[0]
f1, f2 = points_blobs[:, 0], points_blobs[:, 1]
plt.scatter(f1, f2, c='black', s=7)
plt.title('Blobs')

# Stack the blob coordinates into an (n_samples, 2) array for clustering.
X = np.array(list(zip(f1, f2)))
Initialization: choose the k initial centroids, e.g. by picking k samples from the dataset X.
Then loop between the following two steps (assignment and update):
the algorithm repeats these two steps until there is no difference between the old and the new centroids.
# Pick k random samples as initial centroids (so every centroid starts on
# an actual data point) and draw them on top of the blob data.
k = 3
Indecies_centroid = np.random.choice(n_samples, size=k)
C = X[Indecies_centroid]
plt.scatter(f1, f2, c='black', s=7)
plt.scatter(C[:, 0], C[:, 1], marker='*', c='red', s=100)
plt.title('Blobs with random centers')
# One demonstration iteration of Lloyd's algorithm:
# (1) assign every sample to its nearest centroid,
# (2) move each centroid to the mean of its assigned samples.
# (A redundant leading loop that computed `distances` and discarded the
# result has been removed.)
clusters = np.zeros(len(X))  # cluster label per sample, filled below
for i in range(len(X)):
    distances = dist(C, [X[i]])      # distance of sample i to every centroid
    nearest = np.argmin(distances)   # renamed from `cluster`: the old name shadowed the imported sklearn `cluster` module
    clusters[i] = nearest
for i in range(k):
    points = [X[j] for j in range(len(X)) if clusters[j] == i]
    C[i] = np.mean(points, axis=0)   # NOTE(review): yields NaN if cluster i is empty
print(C)
clusters = np.zeros(len(X))
C_old = np.zeros(C.shape)
update_centroids = dist(C, C_old)  # per-centroid movement (convergence measure)
# Iterate until NO centroid moved. The original condition
# `update_centroids.all() != 0` stopped as soon as ANY single centroid
# stood still, even while others were still moving; `.any()` is the
# correct test (keep looping while at least one centroid moved).
while update_centroids.any():
    # assignment step
    for i in range(len(X)):
        distances = dist(C, [X[i]])
        clusters[i] = np.argmin(distances)
    # snapshot the old centroids so the movement can be measured
    C_old = deepcopy(C)
    # update step
    for i in range(k):
        points = [X[j] for j in range(len(X)) if clusters[j] == i]
        if points:  # keep the old centroid if its cluster is empty (avoids NaN)
            C[i] = np.mean(points, axis=0)
    update_centroids = dist(C, C_old)
# Visualize the final clustering: one colour per cluster, centroids as stars.
fig, ax = plt.subplots()
for label in range(k):
    members = np.array([X[idx] for idx in range(len(X)) if clusters[idx] == label])
    ax.scatter(members[:, 0], members[:, 1], s=7)
ax.scatter(C[:, 0], C[:, 1], marker='*', c='black', s=100)
plt.title('Blobs clustered with kmeans')
# k-means++ seeding: the FIRST center should be drawn uniformly from the
# data itself. The original used np.random.random((1, 2)) — a point in the
# unit square that is not a data point and may lie far outside the data
# range; draw a random sample instead.
C2 = X[np.random.choice(len(X), size=1)]
print(C2)
A new center is a data point $x \in X$ chosen with probability $D(x)^2 / \sum_{x \in X} D(x)^2$, where $D(x)$ denotes the shortest distance from a data point to the closest center we have already chosen.
Choose one new data point at random as a new center, using a weighted probability distribution where a point $x$ is chosen with probability proportional to $D(x)^2$.
Preparation
# distance calculator
def dist(a, b, ax=1):
    """Return the Euclidean distance between `a` and `b` along axis `ax`."""
    difference = a - b
    return np.linalg.norm(difference, axis=ax)
#"Kmeans part"
clusters = np.zeros(len(X))
C_old = np.zeros(C2.shape)
error = dist(C2, C_old) # convergence
#"Kmeans++ part"
data_2 = pd.DataFrame(X)
data_long = data_2.reset_index().melt(id_vars="index") #add index from probability distribution to data
a = np.array(data_long)
index =a[0:n_samples,0] #define index
Liste =[] #list for minimal distances
print(C2)
k=3 #define number of clusters
n=0
# Choose the remaining k-1 centers with the k-means++ weighting: each new
# center is a sample drawn with probability proportional to its squared
# distance to the nearest already-chosen center.
for _ in range(k - 1):
    Liste = []
    for i in range(len(X)):
        diff = X[i] - C2
        # renamed from `dist`: the original rebound the name of the
        # distance *function* to this array, clobbering the helper
        dists = np.linalg.norm(diff, axis=1)
        Liste.append(np.min(dists))
    s = np.array(Liste)
    dist_2 = s ** 2
    prob = dist_2 / sum(dist_2)          # p-distribution over samples
    z = np.random.choice(index, p=prob)  # index of the newly chosen center
    c_neu = X[z]                         # data point with index z is the new center
    C2 = np.vstack([C2, c_neu])          # append (replaces the tolist/append dance)
print(C2)
# Plot the data with the k-means++ seeded centers highlighted.
cx1 = C2[:k, 0]
cy1 = C2[:k, 1]
plt.scatter(f1, f2, c='black', s=7)
plt.scatter(cx1, cy1, marker='*', c='red', s=100)
plt.title('Blobs with centers defined by kmeans++ algorithm')
# Lloyd iterations starting from the k-means++ seeds.
# `.any()` replaces the original `error.all() != 0`: the old test stopped
# as soon as ANY one centroid happened to stand still; we must loop while
# at least one centroid still moved.
while error.any():
    for i in range(len(X)):
        diff2 = X[i] - C2
        distances = np.linalg.norm(diff2, axis=1)  # distance to every center
        clusters[i] = np.argmin(distances)
    C_old = deepcopy(C2)
    for i in range(k):
        points = [X[j] for j in range(len(X)) if clusters[j] == i]
        if points:  # empty cluster: keep the previous centroid (avoids NaN)
            C2[i] = np.mean(points, axis=0)
    diff3 = C2 - C_old
    error = np.linalg.norm(diff3, axis=1)  # per-centroid movement
# Final k-means++ clustering, one colour per cluster, centers as stars.
colors = ['r', 'g', 'b', 'y', 'c', 'm']
fig, ax = plt.subplots()
for label in range(k):
    members = np.array([X[idx] for idx in range(len(X)) if clusters[idx] == label])
    ax.scatter(members[:, 0], members[:, 1], s=7, c=colors[label])
plt.scatter(cx1, cy1, marker='*', c='black', s=100)
plt.title('Blobs clustered with kmeans++')
a = np.array(list(zip(f1, f2)))
start1 = time.time()
numrows = np.size(a, 0)
numcols = np.size(a, 1)
end1 = time.time()
batch_s = 100
i = 0
neurows = numrows
# Truncate the data length to a multiple of the batch size so it can be
# reshaped into whole batches. The original loop subtracted an ever-growing
# step (0, 1, 2, ...) and could overshoot past the largest valid multiple;
# the remainder gives the correct truncation directly.
numrows = numrows - (numrows % batch_s)
print(numrows)
print(numrows / batch_s)
# Batch-generator matrix: row j holds the indices of batch j.
# The original reshape had the axes swapped — batch_s rows of only
# numrows//batch_s indices each — while the update loop below consumes
# batch_s points per batch; one row per batch with batch_s indices is
# the consistent layout.
batches = np.arange(numrows).reshape((numrows // batch_s), batch_s)
a[batches[0]]
f1 = a[:, 0]
f2 = a[:, 1]
plt.scatter(f1, f2, c='green', s=10)
X_mini = (list(zip(f1, f2)))
k = 2
# Candidate centroids: k random x-coordinates and k random y-coordinates,
# drawn independently (so the plotted stars are NOT actual data points).
c1 = np.random.choice(f1, size=k)
c2 = np.random.choice(f2, size=k)
plt.scatter(c1, c2, marker='*', c='red', s=100)
# NOTE(review): this draws FRESH random coordinates, so the centroids used
# below differ from the (c1, c2) stars just plotted. The array also has
# shape (2, 1, k) — row 0 holds k x-coordinates, row 1 holds k
# y-coordinates — whereas the distance/update code below indexes it as if
# row j were centroid j as an (x, y) pair; consider
# np.array(list(zip(c1, c2))) (shape (k, 2)) instead.
centroid_array = np.array([[np.random.choice(f1, size=k)],
                           [np.random.choice(f2, size=k)]])
print(centroid_array)
def dist(cell_loc, cluster_number):
    # Distance from data point `cell_loc` (row index into the global `a`)
    # to centroid `cluster_number` (1-based index into the global
    # `centroid_array`); depends on both globals rather than parameters.
    # NOTE(review): with centroid_array of shape (2, 1, k) this subtracts a
    # coordinate row, not a single centroid — confirm the intended layout.
    return np.linalg.norm(a[cell_loc, :] - centroid_array[cluster_number - 1, :])
def assign_centroids(a_array):
    # Assign every data point to its nearest centroid, storing the 1-based
    # centroid id in the global `nearest_centroid`.
    # NOTE(review): the parameter `a_array` is never used — the function
    # always iterates the global `a` over the global `numrows`, so calling
    # it with a mini-batch still (re)assigns ALL points.
    global nearest_centroid
    z = 0
    nearest_centroid = np.zeros([numrows, 1])
    # Loop over all datapoints
    while z < numrows:
        sml_distance = -1  # sentinel: no distance seen yet
        # Loop over every centroid (1-based ids, matching dist())
        j = 1
        while j <= k:
            if sml_distance == -1 or dist(z, j) < sml_distance:
                sml_distance = dist(z, j)
                nearest_centroid[z, 0] = j
            j += 1
        z += 1
n_iterations = (numrows // batch_s)  # one pass: one iteration per batch
v = np.zeros((k, 1))  # per-centroid update counts (drive the learning rate)
j = 1
ctrnew = centroid_array  # NOTE(review): alias, not a copy — updates also mutate centroid_array
# Mini-batch k-means: for each batch, assign points, then move each
# centroid towards its assigned points with a per-centroid learning rate
# 1/v that decays with the number of updates it has received.
while (j <= n_iterations):
    batch = (a[batches[j-1]])  # j-th batch via the batch-generator matrix
    assign_centroids(batch)
    z = 0
    while (z < batch_s):
        # NOTE(review): assign_centroids ignores its argument and labels all
        # of `a`, and the update below reads a[z, :] — the first batch_s
        # points of `a` — rather than batch[z, :]; every iteration therefore
        # re-trains on the same leading points. Verify against the intended
        # mini-batch scheme.
        c = ctrnew[int(nearest_centroid[z, 0])-1, :]
        v[int((nearest_centroid[z, 0]-1)), 0] = int(v[int((nearest_centroid[z, 0]-1)), 0]) + 1
        n = 1/v[int((nearest_centroid[z, 0]-1)), 0]  # decaying learning rate
        ctrnew[int(nearest_centroid[z, 0])-1, :] = c * (1-n) + a[z, :] * n
        z += 1
    j += 1
centroid_array = ctrnew
assign_centroids(a)  # final full-data assignment with the trained centers
nearest_centroid_squeeze = np.squeeze(nearest_centroid.astype(int))
plt.scatter(f1, f2, c='green', s=15)
X_mini = (list(zip(f1, f2)))
# NOTE(review): this plots row 0 of ctrnew against row 1, which only marks
# the centroids if rows are coordinate vectors; with one centroid per row
# the correct indexing would be ctrnew[:, 0] vs ctrnew[:, 1].
plt.scatter(ctrnew[0, :], ctrnew[1, :], marker='*', c='red', s=100)
1. Kmeans
# Reference clustering with scikit-learn's KMeans (default settings).
kmeans = KMeans(n_clusters=3).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=kmeans, s=7)
plt.title('Blobs clustered with scitlearn kmeans')
2. Kmeans++
# Reference clustering with scikit-learn's KMeans using k-means++ seeding.
kmeans = KMeans(n_clusters=3, init='k-means++').fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=kmeans, s=7)
plt.title('Blobs clustered with scitlearn kmeans++')
3. Minibatch
# Reference mini-batch k-means trained batch-by-batch via partial_fit.
k = 3
batch_s = 100
kmeans = MiniBatchKMeans(n_clusters=k,
                         random_state=0,
                         batch_size=batch_s)
for i in range(n_iterations):
    # feed the i-th contiguous slice of `a` as one batch
    kmeans = kmeans.partial_fit(a[(0 + batch_s * i):(batch_s + batch_s * i), :])
plt.scatter(f1, f2, c='green', s=15)
# (removed: a bare `kmeans.cluster_centers_` expression with no effect and a
# pointless `X = list(zip(f1, f2))` that rebound the dataset array to a list)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], marker='*', c='red', s=100)
# Rebuild the working dataset and the distance helper for the benchmarks below.
X = np.array(list(zip(f1, f2)))


def dist(a, b, ax=1):
    """Return the Euclidean distance between `a` and `b` along axis `ax`."""
    return np.linalg.norm(a - b, axis=ax)
Own implementation
# Quality benchmark of our own k-means: average within-cluster sum of
# squares (WSS) over several random initializations, for each k in k_all.
k_all = (2, 3, 5)  # the k values to compare; edit freely
WSS_own = np.zeros(len(k_all))
for l in range(len(k_all)):
    k = k_all[l]
    times_to_repeat = 10  # average over several random initializations
    sum_WSS = 0
    for rep in range(times_to_repeat):
        # random data points as initial centroids, so no centroid starts
        # far away from all data (which would leave it without members)
        Indecies_centroid = np.random.choice(n_samples, size=k)
        C = X[Indecies_centroid]
        clusters = np.zeros(len(X))
        C_old = np.zeros(C.shape)
        error = dist(C, C_old)  # per-centroid movement (convergence)
        # Fixes vs the original: `.any()` replaces the `.all() != 0`
        # early-stop bug; the unused start_time and the in-loop
        # `from copy import deepcopy` (already imported at file top) are gone.
        while error.any():
            for i in range(len(X)):
                distances = dist(C, [X[i]])
                clusters[i] = np.argmin(distances)
            C_old = deepcopy(C)
            for i in range(k):
                points = [X[j] for j in range(len(X)) if clusters[j] == i]
                if points:  # keep centroid if its cluster is empty
                    C[i] = np.mean(points, axis=0)
            error = dist(C, C_old)
        # within-cluster sum of squared distances for this run
        WSS = 0
        for j in range(len(X)):
            WSS += dist(X[j], C[int(clusters[j])], ax=0) ** 2
        sum_WSS = sum_WSS + WSS
    average_WSS = sum_WSS / times_to_repeat
    WSS_own[l] = average_WSS
print(WSS_own)
Scitlearn implementation
# Quality benchmark of scikit-learn's KMeans: average inertia (== WSS)
# over several runs for each cluster count. (An unused start_time and a
# bare `kmeans.inertia_` no-op expression were removed.)
n_clusters = (2, 3, 5)
WSS_scikit = np.zeros(len(n_clusters))
for l in range(len(n_clusters)):
    times_to_repeat = 10
    sum_WSS = 0
    for i in range(times_to_repeat):
        kmeans = KMeans(n_clusters=n_clusters[l])
        a = kmeans.fit_predict(X)
        sum_WSS = sum_WSS + kmeans.inertia_  # inertia_ is the within-cluster SS
    average_WSS = sum_WSS / times_to_repeat
    WSS_scikit[l] = average_WSS
print(WSS_scikit)
Plot
# Grouped bar chart comparing the average WSS of our implementation with
# scikit-learn for every tested k; labels adapt automatically to the
# chosen n_samples and k_all.
n_groups = len(k_all)
means_own = WSS_own
means_scikit = WSS_scikit

fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.35
opacity = 0.8
rects1 = plt.bar(index, means_own, bar_width,
                 alpha=opacity, color='b',
                 label='own implementation')
rects2 = plt.bar(index + bar_width, means_scikit, bar_width,
                 alpha=opacity, color='g',
                 label='Scikit learn implementation')
plt.xlabel('k')
plt.ylabel('average sum of squared distances within all clusters')
plt.title('quality comparison, blobs , n=' + str(n_samples))
plt.xticks(index + bar_width, (k_all))
plt.legend()
plt.tight_layout()
plt.show()
# Rebuild the dataset array and the distance helper for the runtime benchmarks.
X = np.array(list(zip(f1, f2)))


def dist(a, b, ax=1):
    """Return the Euclidean distance between `a` and `b` along axis `ax`."""
    return np.linalg.norm(a - b, axis=ax)
Own implementation
# Runtime benchmark of our own k-means, averaged over repeats per k.
k_all = (2, 3, 5)  # the k values to compare; edit freely
runtime_own = np.zeros(len(k_all))
for l in range(len(k_all)):
    k = k_all[l]
    times_to_repeat = 5
    sum_runtime = 0
    for rep in range(times_to_repeat):
        start_time = timeit.default_timer()
        # random data points as initial centroids
        Indecies_centroid = np.random.choice(n_samples, size=k)
        C = X[Indecies_centroid]
        clusters = np.zeros(len(X))
        C_old = np.zeros(C.shape)
        error = dist(C, C_old)  # convergence measure
        # Fixes vs the original: `.any()` replaces the `.all() != 0`
        # early-stop bug, and the in-loop `from copy import deepcopy`
        # (imported at file top) no longer inflates the measured time.
        while error.any():
            for i in range(len(X)):
                distances = dist(C, [X[i]])
                clusters[i] = np.argmin(distances)
            C_old = deepcopy(C)
            for i in range(k):
                points = [X[j] for j in range(len(X)) if clusters[j] == i]
                if points:  # keep centroid if its cluster is empty
                    C[i] = np.mean(points, axis=0)
            error = dist(C, C_old)
        runtime = timeit.default_timer() - start_time
        sum_runtime = sum_runtime + runtime
    average_runtime = sum_runtime / times_to_repeat
    runtime_own[l] = average_runtime
print(runtime_own)
Scitlearn implementation
# Runtime of scikit-learn's KMeans, averaged over repeats per cluster count.
n_clusters = (2, 3, 5)
runtime_scikit = np.zeros(len(n_clusters))
for l in range(len(n_clusters)):
    times_to_repeat = 10
    sum_runtime = 0
    for i in range(times_to_repeat):
        start_time = timeit.default_timer()
        kmeans = KMeans(n_clusters[l]).fit_predict(X)
        runtime = timeit.default_timer() - start_time
        sum_runtime = sum_runtime + runtime
    average_runtime = sum_runtime / times_to_repeat
    runtime_scikit[l] = average_runtime
print(runtime_scikit)
Plot
# Grouped bar chart comparing average runtimes of our implementation and
# scikit-learn for every tested k.
n_groups = len(k_all)
means_own = runtime_own
means_scikit = runtime_scikit

fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.35
opacity = 0.8
rects1 = plt.bar(index, means_own, bar_width,
                 alpha=opacity, color='b',
                 label='own implementation')
rects2 = plt.bar(index + bar_width, means_scikit, bar_width,
                 alpha=opacity, color='g',
                 label='Scikit learn implementation')
plt.xlabel('k')
plt.ylabel('average runtime')
plt.title('runtime comparison, blobs , n=' + str(n_samples))
plt.xticks(index + bar_width, (k_all))
plt.legend()
plt.tight_layout()
plt.show()
# Runtime benchmark of our own random-init k-means (repeated for the
# comparison against the k-means++ seeding below).
k_all = (2, 3, 5)  # the k values to compare; edit freely
runtime_own = np.zeros(len(k_all))
for l in range(len(k_all)):
    k = k_all[l]
    times_to_repeat = 5
    sum_runtime = 0
    for rep in range(times_to_repeat):
        start_time = timeit.default_timer()
        # random data points as initial centroids
        Indecies_centroid = np.random.choice(n_samples, size=k)
        C = X[Indecies_centroid]
        clusters = np.zeros(len(X))
        C_old = np.zeros(C.shape)
        error = dist(C, C_old)  # convergence measure
        # Fixes vs the original: `.any()` replaces the `.all() != 0`
        # early-stop bug, and the in-loop `from copy import deepcopy`
        # (imported at file top) no longer inflates the measured time.
        while error.any():
            for i in range(len(X)):
                distances = dist(C, [X[i]])
                clusters[i] = np.argmin(distances)
            C_old = deepcopy(C)
            for i in range(k):
                points = [X[j] for j in range(len(X)) if clusters[j] == i]
                if points:  # keep centroid if its cluster is empty
                    C[i] = np.mean(points, axis=0)
            error = dist(C, C_old)
        runtime = timeit.default_timer() - start_time
        sum_runtime = sum_runtime + runtime
    average_runtime = sum_runtime / times_to_repeat
    runtime_own[l] = average_runtime
print(runtime_own)
# Runtime benchmark of the k-means++ seeding step for each k in k_all.
k_all = (2, 3, 5)  # the k values to compare; edit freely
runtime_kmplus = np.zeros(len(k_all))
for l in range(len(k_all)):
    k = k_all[l]
    # BUG FIX: the original re-set k = 3 inside the timed body, so every
    # "k" actually benchmarked the same 3-center seeding.
    times_to_repeat = 5
    sum_runtime = 0
    for rep in range(times_to_repeat):
        start_time = timeit.default_timer()
        C2 = np.random.random((1, 2))

        def dist(a, b, ax=1):
            """Return the Euclidean distance between `a` and `b` along axis `ax`."""
            return np.linalg.norm(a - b, axis=ax)

        #"Kmeans part"
        clusters = np.zeros(len(X))
        C_old = np.zeros(C2.shape)
        error = dist(C2, C_old)  # convergence
        #"Kmeans++ part"
        # Sample indices for the weighted draw. Replaces the pandas
        # reset_index()/melt() round-trip, which rebuilt the same 0..n-1
        # index far more expensively inside the timed region.
        index = np.arange(n_samples)
        for _ in range(k - 1):
            Liste = []  # minimal distances to the nearest chosen center
            for i in range(len(X)):
                diff = X[i] - C2
                # renamed from `dist`: the original clobbered the function
                dists = np.linalg.norm(diff, axis=1)
                Liste.append(np.min(dists))
            s = np.array(Liste)
            dist_2 = s ** 2
            prob = dist_2 / sum(dist_2)          # p-distribution
            z = np.random.choice(index, p=prob)  # index of the new center
            C2 = np.vstack([C2, X[z]])           # append the new center
        runtime = timeit.default_timer() - start_time
        sum_runtime = sum_runtime + runtime
    average_runtime = sum_runtime / times_to_repeat
    runtime_kmplus[l] = average_runtime
print(runtime_kmplus)
# Grouped bar chart: random-init k-means vs k-means++ seeding runtime.
n_groups = len(k_all)
means_own = runtime_own
means_kmplus = runtime_kmplus

fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.35
opacity = 0.8
rects1 = plt.bar(index, means_own, bar_width,
                 alpha=opacity, color='b',
                 label='kmeans implementation')  # fixed legend typo: was 'kmenans'
rects2 = plt.bar(index + bar_width, means_kmplus, bar_width,
                 alpha=opacity, color='g',
                 label='kmplus implementation')
plt.xlabel('k')
plt.ylabel('average runtime')
plt.title('runtime comparison, blobs , n=' + str(n_samples))
plt.xticks(index + bar_width, (k_all))
plt.legend()
plt.tight_layout()
plt.show()
# Load the 10x Genomics PBMC3k matrix (cached for faster re-reads).
adata = sc.read_10x_mtx(
    'C:/Users/Computer/Desktop/pbmc3k_filtered_gene_bc_matrices.tar/pbmc3k_filtered_gene_bc_matrices/hg19',  # the directory with the `.mtx` file
    var_names='gene_symbols',  # use gene symbols for the variable names (variables-axis index)
    cache=True)
# `.X` is the public accessor; the original reached into AnnData's private
# `._X` attribute, which can break across anndata versions.
x = adata.X.todense()
x.shape  # shape of the data (cells x genes)
The ratio of mitochondrial transcripts to total cellular RNA indicates how high or low the quality of a cell is: the higher the mitochondrial proportion, the lower the data quality. From a biological point of view, a larger mitochondrial fraction suggests that, e.g. from a damaged cell, more "normal" cytoplasmic RNA may have been lost through perforations, while the mitochondria-enclosed transcripts remain in the cell.
# --- Quality-control filtering and normalization (scanpy PBMC workflow) ---
sc.pp.filter_cells(adata,min_genes=200)#filter cells with minimum 200 expressed genes
sc.pp.filter_genes(adata,min_cells=3) #filter genes with at least 3 cells the genes are expressed in
# mitochondrial genes carry the 'MT-' prefix in this annotation
mtgenes = adata.var_names.str.startswith('MT-')
# per-cell fraction of counts that come from mitochondrial genes
adata.obs['mito_percent'] = np.sum(adata[:,mtgenes].X, axis = 1).A1/np.sum(adata.X, axis = 1).A1
# total counts per cell
adata.obs['n_counts']=adata.X.sum(axis=1).A1
# drop cells with too many genes (likely doublets) and >5% mitochondrial counts
adata = adata[adata.obs['n_genes'] < 2500, :]
adata = adata[adata.obs['mito_percent'] < 0.05, :]
# normalize every cell to 10k total counts, then log-transform
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
# keep only highly variable genes for the downstream analysis
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
adata = adata[:, adata.var['highly_variable']]
adata.shape
We chose 10 principal components.
# PCA down to 10 components.
# BUG FIX: the original ran PCA on `x`, the raw dense matrix captured
# BEFORE any of the QC filtering / normalization above — use the processed
# adata.X so the preprocessing actually takes effect.
data_array = adata.X.toarray() if hasattr(adata.X, 'toarray') else np.asarray(adata.X)
pca = PCA(n_components=10)
pca_data = pca.fit_transform(data_array)
pca_data.shape
plt.scatter(pca_data[:, 0], pca_data[:, 1], s=5)
# Working copy of the PCA coordinates plus the distance helper.
X_data = np.array(pca_data)


def dist(a, b, ax=1):
    """Return the Euclidean distance between `a` and `b` along axis `ax`."""
    return np.linalg.norm(a - b, axis=ax)
k = 2
# BUG FIX: draw initial centroid indices from the actual number of PCA
# rows. The original sampled from n_samples (=1000, the size of the
# earlier synthetic dataset), which is unrelated to this dataset and
# restricts/derails which cells can be chosen as centroids.
Indecies_centroid = np.random.choice(len(X_data), size=k)
C = X_data[Indecies_centroid]
plt.scatter(X_data[:, 0], X_data[:, 1], c='black', s=7)
plt.scatter(C[:, 0], C[:, 1], marker='*', c='red', s=100)
clusters = np.zeros(len(X_data))
C_old = np.zeros(C.shape)
update_centroids = dist(C, C_old)  # per-centroid movement (convergence)
# `.any()` fixes the original `.all() != 0`, which stopped as soon as ANY
# single centroid happened to stand still while others still moved.
while update_centroids.any():
    for i in range(len(X_data)):
        distances = dist(C, [X_data[i]])
        clusters[i] = np.argmin(distances)
    C_old = deepcopy(C)
    for i in range(k):
        points = [X_data[j] for j in range(len(X_data)) if clusters[j] == i]
        if points:  # keep the old centroid if its cluster is empty (avoids NaN)
            C[i] = np.mean(points, axis=0)
    update_centroids = dist(C, C_old)
# 2-D view (first two PCA components) of the clustering result.
fig, ax = plt.subplots()
for label in range(k):
    members = np.array([X_data[idx] for idx in range(len(X_data)) if clusters[idx] == label])
    ax.scatter(members[:, 0], members[:, 1], s=7)
ax.scatter(C[:, 0], C[:, 1], marker='*', c='black', s=100)
X_data.shape
C2 = np.random.random((1, 10))  # initial center in the 10-D PCA space
X_data = np.array(pca_data)
print(C2)


def dist(a, b, ax=1):
    """Return the Euclidean distance between `a` and `b` along axis `ax`."""
    return np.linalg.norm(a - b, axis=ax)


#"Kmeans part"
clusters = np.zeros(len(X_data))
C_old = np.zeros(C2.shape)
error = dist(C2, C_old)  # convergence
#"Kmeans++ part"
# Sample indices 0..len(X_data)-1 for the weighted draw below. The
# original built the same values via a pandas reset_index()/melt()
# round-trip and a slice of the resulting array.
index = np.arange(len(X_data))
Liste = []  # minimal distances to the nearest chosen center
k = 2  # number of clusters
n = 0
# Draw the remaining k-1 centers with the k-means++ weighting.
for _ in range(k - 1):
    Liste = []
    for i in range(len(X_data)):
        diff = X_data[i] - C2
        # renamed from `dist`: the original rebound the name of the
        # distance *function* to this array, clobbering the helper
        dists = np.linalg.norm(diff, axis=1)
        Liste.append(np.min(dists))
    s = np.array(Liste)
    dist_2 = s ** 2
    prob = dist_2 / sum(dist_2)          # p-distribution over samples
    z = np.random.choice(index, p=prob)  # index of the newly chosen center
    C2 = np.vstack([C2, X_data[z]])      # append the new center
# Plot the first two PCA coordinates with the chosen centers.
cx1 = C2[:k, 0]
cy1 = C2[:k, 1]
plt.scatter(X_data[:, 0], X_data[:, 1], s=5)
plt.scatter(cx1, cy1, marker='*', c='black', s=100)
plt.title('Blobs with centers defined by kmeans++ algorithm')
# Lloyd iterations from the k-means++ seeds. `.any()` fixes the original
# `error.all() != 0`, which stopped as soon as any single centroid
# happened to stand still.
while error.any():
    for i in range(len(X_data)):
        diff2 = X_data[i] - C2
        distances = np.linalg.norm(diff2, axis=1)  # distance to every center
        clusters[i] = np.argmin(distances)
    C_old = deepcopy(C2)
    for i in range(k):
        points = [X_data[j] for j in range(len(X_data)) if clusters[j] == i]
        if points:  # empty cluster: keep the previous centroid (avoids NaN)
            C2[i] = np.mean(points, axis=0)
    diff3 = C2 - C_old
    error = np.linalg.norm(diff3, axis=1)  # per-centroid movement
# Final 2-D view of the clustered PCA data, centers drawn as stars.
colors = ['r', 'g', 'b', 'y', 'c', 'm']
fig, ax = plt.subplots()
for label in range(k):
    members = np.array([X_data[idx] for idx in range(len(X_data)) if clusters[idx] == label])
    ax.scatter(members[:, 0], members[:, 1], s=7, c=colors[label])
cx1 = C2[:k, 0]
cy1 = C2[:k, 1]
plt.scatter(cx1, cy1, marker='*', c='black', s=100)
plt.title('Blobs clustered with kmeans++')
C2 = []  # reset before the marker-gene analysis below
# Re-load the raw PBMC matrix for the marker-gene analysis.
adata = sc.read_10x_mtx(
    'C:/Users/Computer/Desktop/pbmc3k_filtered_gene_bc_matrices.tar/pbmc3k_filtered_gene_bc_matrices/hg19',  # the directory with the `.mtx` file
    var_names='gene_symbols',  # use gene symbols for the variable names (variables-axis index)
    cache=True)
# public `.X` accessor instead of the private `._X` attribute
x = adata.X.todense()
x.shape  # shape of the data
# List of PBMC marker genes used to characterize the clusters.
marker_genes = ['IL7R','CCR7','S100A4','GNLY','NKG7','MS4A1','CD8A',
                'FCGR3A','MS4A7','CD14','LYZ','FCER1A','CST3','PPBP']
dataframe_total = adata.to_df()
dataframe_markergenes = dataframe_total[marker_genes].copy()
# NOTE(review): n_components equals the number of marker genes (14), so
# this "PCA" is only a rotation — no dimensionality is actually reduced.
pca = PCA(n_components=14)
pca_markergenes = pca.fit_transform(dataframe_markergenes)
pca_markergenes.shape
plt.scatter(pca_markergenes[:, 0], pca_markergenes[:, 1], s=5)
# Distance helper and marker-gene coordinate matrix. The original defined
# both twice back-to-back; the duplicates are collapsed here.
def dist(a, b, ax=1):
    """Return the Euclidean distance between `a` and `b` along axis `ax`."""
    return np.linalg.norm(a - b, axis=ax)


X_marker = np.array(pca_markergenes)
k = 14
# BUG FIX: the original indexed with len(X) — the 1000-sample synthetic
# dataset from earlier — instead of len(X_marker) (the PBMC cells), both
# when drawing the initial centroids and in the assignment loop, so only
# part of the marker data was ever assigned to clusters.
Indecies_centroid = np.random.choice(len(X_marker), size=k)
C = X_marker[Indecies_centroid]
plt.scatter(X_marker[:, 0], X_marker[:, 1], c='black', s=7)
plt.scatter(C[:, 0], C[:, 1], marker='*', c='red', s=100)
clusters = np.zeros(len(X_marker))
C_old = np.zeros(C.shape)
update_centroids = dist(C, C_old)  # per-centroid movement (convergence)
# `.any()` fixes the original `.all() != 0` early-stop bug
while update_centroids.any():
    for i in range(len(X_marker)):
        distances = dist(C, [X_marker[i]])
        clusters[i] = np.argmin(distances)
    C_old = deepcopy(C)
    for i in range(k):
        points = [X_marker[j] for j in range(len(X_marker)) if clusters[j] == i]
        if points:  # keep the old centroid if its cluster is empty (avoids NaN)
            C[i] = np.mean(points, axis=0)
    update_centroids = dist(C, C_old)
# Final marker-gene clustering in the first two PCA coordinates.
fig, ax = plt.subplots()
for label in range(k):
    members = np.array([X_marker[idx] for idx in range(len(X_marker)) if clusters[idx] == label])
    ax.scatter(members[:, 0], members[:, 1], s=7)
ax.scatter(C[:, 0], C[:, 1], marker='*', c='black', s=100)